##########
# Crossing Borders: Cleaning CBW data
#########


library(here)
library(data.table)
library(haven)

# Read the quarterly CBW register (Stata .dta) and turn the result into a
# data.table by reference.
cbw <- read_dta(
  here(
    "data", "cleaning", "CBWregister",
    "ggs_grenzgaengerstatistik_1996_2016_quarterly.dta"
  )
)
setDT(cbw)

# Preview of the first rows as a visual sanity check.
head(cbw)


# Clean up year: the raw "year" column is renamed to "year_month", and a plain
# calendar year is derived from it.
setnames(cbw, "year", "year_month")

# Rounding to four significant digits zeroes everything after the first four
# digits; dividing by 100 then leaves those four digits as the year.
# NOTE(review): assumes year_month is a six-digit YYYYMM number -- confirm.
cbw[, year := signif(year_month, 4) / 100]

# Merge in the 2019-era BFS numbers: update join that matches cbw$arbgdneu
# against cw$bfs (cw rows from 1996 onwards only) and copies the matching
# bfs19 into cbw by reference. `cw` is not defined in this script -- presumably
# the municipality crosswalk loaded beforehand.
#
# The i. prefix states explicitly that the value is taken from the joined-in
# table; the bare name only resolved to cw's column because cbw had no bfs19
# column of its own.
#
# NOTE(review): the join key is the municipality code alone, while
# cw[year >= 1996] may hold one row per code and year. With several matches per
# cbw row a single value is kept (presumably the last match) -- confirm that a
# given bfs code maps to the same bfs19 in every year, or add year to `on`.
cbw[cw[year >= 1996], on = "arbgdneu==bfs", bfs19 := i.bfs19]

# What are the country codes for border countries:
# 207 Germany
# 212 France
# 218 Italy
# 229 Austria
# 222 Liechtenstein

# Number of distinct quarters the register contains for each calendar year.
# This is the divisor that turns yearly row counts into an average per quarter.
# It must be counted per year over the whole register, not inside each
# (year, bfs19) group: counted per group, a municipality with CBW in only one
# quarter would be divided by 1 instead of 4, which overstates its annual
# average and disagrees with cbwYearMonth, where quarters without CBW are 0.
cbw[, quartersInYear := uniqueN(year_month), by = year]

# Average number of CBW per quarter for every year x municipality, in total
# and split by country code. na.rm = TRUE keeps a missing `nat` from turning
# a whole group's count into NA (the %in% test already never yields NA); rows
# with a missing `nat` are counted for no country and so end up in cbwOther.
cbwYear <- cbw[, .(
  cbw = .N / quartersInYear[1L],
  cbwDE = sum(nat == 207, na.rm = TRUE) / quartersInYear[1L],
  cbwFR = sum(nat == 212, na.rm = TRUE) / quartersInYear[1L],
  cbwIT = sum(nat == 218, na.rm = TRUE) / quartersInYear[1L],
  cbwOE = sum(nat %in% c(229, 222)) / quartersInYear[1L]
), keyby = .(year, bfs19)]

# Drop the helper column again so cbw keeps its original set of columns.
cbw[, quartersInYear := NULL]

# Everything not attributed to one of the border countries above.
cbwYear[, cbwOther := cbw - cbwDE - cbwFR - cbwIT - cbwOE]

# Append zero-count rows for municipalities that have no row in a period.
#
# counts:     data.table holding the columns `period_col` and "bfs19"; all
#             remaining columns are filled with 0 in the appended rows.
# period_col: name of the column that identifies the period.
# periods:    period values to check, one after the other.
# all_bfs19:  municipality codes that every period is expected to contain.
# verbose:    if TRUE, emit a message per period that needed imputation.
#
# Returns `counts` unchanged when nothing is absent. Otherwise returns a new
# data.table with the zero rows appended below the original rows, in the order
# of `periods`; the result is not re-sorted.
#
# setdiff_dw() is a helper defined outside this script; its `in_y_not_x`
# element is taken to be the codes of `all_bfs19` that lack a row for the
# period -- verify against its definition.
impute_zero_cbw <- function(counts, period_col, periods, all_bfs19,
                            verbose = FALSE) {
  fillers <- list()

  for (p in periods) {
    # which() drops rows whose period is NA, as a data.table filter would.
    present <- counts[["bfs19"]][which(counts[[period_col]] == p)]
    absent <- setdiff_dw(present, all_bfs19)$in_y_not_x

    if (length(absent) == 0) {
      next
    }

    if (verbose) {
      message(length(absent), " municipalities are not found in the CBW register in ", p, ". Imputing them as having 0 CBW.")
    }

    # One all-zero row per absent municipality, shaped like `counts`; the
    # period and municipality columns are then overwritten with real values.
    filler <- data.table(matrix(0L, nrow = length(absent), ncol = length(counts)))
    setnames(filler, colnames(counts))
    set(filler, j = "bfs19", value = absent)
    set(filler, j = period_col, value = rep(p, nrow(filler)))

    fillers[[length(fillers) + 1L]] <- filler
  }

  if (length(fillers) == 0) {
    return(counts)
  }

  # Bind once instead of re-copying the growing table in every iteration.
  rbindlist(c(list(counts), fillers))
}

# Yearly table: municipalities without any CBW in a year get explicit 0 rows.
cbwYear <- impute_zero_cbw(
  cbwYear, "year", 1996:2016, unique(cw$bfs19),
  verbose = TRUE
)

# Number of CBW per quarter and municipality, with the same zero imputation
# (silent here, since it would otherwise report once per quarter).
cbwYearMonth <- cbw[, .(cbw = .N), keyby = .(year_month, bfs19)]

cbwYearMonth <- impute_zero_cbw(
  cbwYearMonth, "year_month", unique(cbwYearMonth$year_month),
  unique(cw$bfs19)
)


# Tell the user which tables this script leaves behind.
message(paste(
  "You've created the following data.tables:",
  "  * cbwYear: contains the number of CBW per bfs19-year dyad",
  "  * cbwYearMonth: contains the number of CBW per bfs19-quarter dyad",
  sep = "\n"
))

# The raw register is dropped now that both summaries have been built.
rm(list = "cbw")